# Downloading subtitles
library(dplyr)
library(rvest)
library(utils)
library(downloader)
set.seed(42)
# Loading the dataset
### Available at: https://www.kaggle.com/datasets/ashirwadsangwan/imdb-dataset
# quote = "" switches quote processing off. The file is plain tab-separated
# text, and with the default quote character a title containing an unbalanced
# double quote would swallow the following lines into one field, silently
# dropping rows.
df <- read.csv("title.basics.tsv", header = TRUE, sep = "\t", quote = "",
               encoding = "UTF-8")
# Shuffle all rows (sampling nrow(df) rows without replacement)
df <- sample_n(df, nrow(df))
head(df)
## tconst titleType primaryTitle
## 1 tt3100548 video Basta che non si sappia in giro
## 2 tt5453150 tvEpisode Episode #1.3160
## 3 tt6716786 tvEpisode Episode #1.15
## 4 tt1026957 tvEpisode Episode #1.8632
## 5 tt3063346 tvEpisode Outdoor Bar, Fire Pit and Creativity Retreat
## 6 tt2752524 tvEpisode Teaching Toward the Future
## originalTitle isAdult startYear endYear
## 1 Basta che non si sappia in giro 1 2009 \\N
## 2 Episode #1.3160 0 1979 \\N
## 3 Episode #1.15 0 2015 \\N
## 4 Episode #1.8632 0 2007 \\N
## 5 Outdoor Bar, Fire Pit and Creativity Retreat 0 2013 \\N
## 6 Teaching Toward the Future 0 2013 \\N
## runtimeMinutes genres
## 1 \\N Adult
## 2 \\N Drama
## 3 \\N Drama,Romance
## 4 \\N Drama,Romance
## 5 21 Reality-TV
## 6 \\N News,Talk-Show
# Keep only the rows whose type is "movie" and whose genre is not the
# missing-value marker "\\N"
df <- df[with(df, titleType == "movie" & genres != "\\N"), ]
# The primary genre is whatever precedes the first comma of the genre list
df[["main_genre"]] <- gsub(",.*", "", df[["genres"]])
# Remove every character the pattern does not allow (ASCII letters, digits,
# spaces and the á-ű / Á-Ű ranges) -- anywhere in the title, not only at its
# beginning
df[["primaryTitle"]] <- gsub("[^a-zA-Z0-9 á-űÁ-Ű]", "", df[["primaryTitle"]])
head(df)
## tconst titleType primaryTitle
## 23 tt0258586 movie La foret qui tue
## 28 tt0299057 movie Nayezdniki
## 36 tt2006141 movie 1959
## 68 tt0400230 movie Biography of Mario Vargas Llosa
## 84 tt0338855 movie Chhote Sarkar
## 100 tt0008188 movie The Little American
## originalTitle isAdult startYear endYear
## 23 La foret qui tue 0 1927 \\N
## 28 Nayezdniki 0 1987 \\N
## 36 1959 0 2016 \\N
## 68 Biography of Mario Vargas Llosa 0 2004 \\N
## 84 Chhote Sarkar 0 1996 \\N
## 100 The Little American 0 1917 \\N
## runtimeMinutes genres main_genre
## 23 \\N Drama Drama
## 28 50 Drama Drama
## 36 88 Thriller Thriller
## 68 75 Documentary Documentary
## 84 151 Comedy,Romance,Thriller Comedy
## 100 80 Drama,Romance,War Drama
##### Downloading subtitles from yifysubtitles.org
# Site root, prepended to the relative links scraped from the pages
root <- "https://yifysubtitles.org"
# Search endpoint; a movie title is appended to it as the query string
url <- paste0(root, "/search?q=")
# Wait until the temp directory has some content, or until the time budget
# is used up.
#
# Args:
#   max:      upper bound (in seconds) on the accumulated sleep time.
#   dir:      directory to poll; defaults to the temp directory of the script.
#   interval: length of one sleep between two polls, in seconds.
#
# Returns (invisibly) the accumulated sleep time in seconds.
io_wait <- function(max = 2, dir = "./subtitles/temp/", interval = 0.1) {
  waited <- 0
  # A loop is required here: a plain `if` would sleep at most once and never
  # come close to `max`. `&&` is the scalar, short-circuiting operator.
  while (length(list.files(dir)) == 0 && waited <= max) {
    Sys.sleep(interval)
    waited <- waited + interval
  }
  invisible(waited)
}
# Function to get download site and title
#
# Reads the search result page, follows the link inside the first
# ".media-body" node to the movie's subtitle page, picks the first link in the
# ".table.other-subs" table that matches "subtitle.*eng", and returns the
# absolute URL(s) of the ".download-subtitle" link(s) found on that page.
#
# Args:
#   search_url: URL of the search result page for one title.
#
# Returns: character vector of download URL(s), or NULL when any step fails
#   (page not reachable, no search hit, no English subtitle, no download link).
#
# Side effects: stores the parsed movie page in the global `subtitle_site` and
#   the text of its ".movie-main-title" node in the global `s_title`;
#   download_subtitle() reads `s_title` to name the file.
get_download_site_and_title <- function(search_url) {
  tryCatch({
    # Use the argument; the previous version read a global named `search`,
    # which silently resolves to base::search() when no such global exists.
    movie_path <- read_html(search_url) %>%
      html_node(".media-body") %>%
      html_node("a") %>%
      html_attr("href")
    if (is.na(movie_path)) {
      stop("no search result")
    }
    subtitle_site <<- read_html(paste0(root, movie_path))
    # Get title from the site
    s_title <<- subtitle_site %>%
      html_node(".movie-main-title") %>%
      html_text()
    english_path <- subtitle_site %>%
      html_nodes(".table.other-subs") %>%
      html_nodes("a") %>%
      html_attr("href") %>%
      grep("subtitle.*eng", ., value = TRUE) %>%
      .[1]
    # Fail here instead of requesting a bogus "<root>NA" address
    if (is.na(english_path)) {
      stop("no English subtitle listed")
    }
    download_path <- read_html(paste0(root, english_path)) %>%
      html_nodes(".download-subtitle") %>%
      html_attr("href")
    # paste0() would turn an empty result into the bare root URL
    if (length(download_path) == 0) {
      stop("no download link found")
    }
    paste0(root, download_path)
  }, error = function(e) NULL)
}
# Function to download and unzip the subtitle
#
# Downloads the archive to `dest`, unpacks it into `temp_dir`, moves the first
# unpacked file to "./subtitles/<title>.srt" (title taken from the global
# `s_title`) and finally removes the archive and the temp directory.
#
# Args:
#   download_site: URL of the archive; only the first element is used.
#   dest:          path the archive is saved to.
#   temp_dir:      directory the archive is unpacked into.
#   quiet:         forwarded to the downloader as `quiet`.
#
# Any error sets the global `skip` to TRUE so that the caller can clean up.
download_subtitle <- function(download_site, dest, temp_dir, quiet = TRUE) {
  tryCatch({
    # `quiet` is passed by name: positionally it travels through `...` and
    # could land on a different argument of the underlying download function.
    download(download_site[1], dest, quiet = quiet)
    unzip(dest, exdir = temp_dir)
    io_wait()
    # Look into the directory that was actually used for unpacking
    unzipped <- list.files(temp_dir, full.names = TRUE)
    if (length(unzipped) == 0) {
      stop("archive contained no files")
    }
    # Sanitise the title only (never the directory part of the path). The
    # class covers colon, pipe, question mark, backslash and forward slash;
    # the earlier pattern ":|\\|\\?" matched only ":" and the literal
    # sequence "|?".
    safe_title <- gsub("[:|?\\\\/]", " ", s_title)
    new_name <- paste0("./subtitles/", safe_title, ".srt")
    # file.rename() reports failure through its return value, not an error
    if (!file.rename(unzipped[1], new_name)) {
      stop("could not move subtitle to ", new_name)
    }
    unlink(dest)
    unlink(temp_dir, recursive = TRUE)
  }, error = function(e) {
    skip <<- TRUE
  })
}
# Make sure every directory in `directories` exists.
# Paths are handled in the given order, so a parent listed before its child
# is created first; directories that already exist are left untouched.
create_dirs <- function(directories) {
  for (path in directories) {
    if (dir.exists(path)) next
    dir.create(path)
  }
}
# Output directory for the subtitles and a temp directory for unpacking
dirs <- c("subtitles", "subtitles/temp")
create_dirs(dirs)
# Downloading subtitles
# Stop as soon as `n` subtitle files have been collected
n <- 1000
# seq_len() yields an empty sequence for an empty data frame (1:0 would not)
for (i in seq_len(nrow(df))) {
  # Count finished subtitles only, not the temp directory or leftover archives
  if (length(list.files("./subtitles", pattern = "\\.srt$")) >= n) {
    break
  }
  title <- df$primaryTitle[i]
  # A title can be empty once its special characters were stripped; searching
  # for nothing cannot produce a meaningful hit
  if (is.na(title) || !nzchar(trimws(title))) {
    next
  }
  # print(paste(round(i/nrow(df), 2), title, sep=" "))
  search <- URLencode(paste0(url, title))
  # download_subtitle() sets this global flag to TRUE when it fails
  skip <- FALSE
  download_site <- get_download_site_and_title(search)
  if (!is.null(download_site)) {
    filedest <- paste0("./subtitles/", title, ".zip")
    download_subtitle(download_site, filedest, "subtitles/temp")
    if (skip) {
      # Remove whatever the failed attempt left behind
      unlink(filedest)
      unlink("./subtitles/temp", recursive = TRUE)
    }
  }
}
# Basic sentiment analysis
library(srt)
library(sentimentr)
library(ggplot2)
library(RColorBrewer)
library(reshape2)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:sentimentr':
##
## highlight
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Get a sample of subtitles
# Only .srt files are listed, and the sample size is capped at the number of
# files available so that sample() cannot fail on a smaller collection.
# The directory is given without a trailing slash so that the full names
# contain a single separator.
subs <- list.files("subtitles", pattern = "\\.srt$", full.names = TRUE)
if (length(subs) == 0) {
  stop("No .srt files found in 'subtitles'", call. = FALSE)
}
subs <- sample(subs, min(1000, length(subs)))
# Reading and converting subtitles to a character vector
subtitle <- read_srt(subs[1])
subtitle <- subtitle$subtitle
# Title and year are parsed from the bare file name, which is expected to
# look like "<title> (<year>).srt"; basename() keeps the directory part out
# of the result.
title <- gsub("\\.srt|\\(.*", "", basename(subs[1])) %>% stringi::stri_trim()
year <- gsub(".*\\(|\\)\\.srt", "", basename(subs[1]))
# One long string per movie, with line breaks replaced by spaces
text <- paste(subtitle, collapse = " ") %>% gsub("\n", " ", .)
# Build a one-subtitle data frame of sentiment, emotion and profanity scores.
#
# Args:
#   text:  the subtitle text as a single string.
#   title: movie title stored in the "title" column.
#   year:  release year stored in the "year" column.
#
# Returns a data frame with the columns title, year, avg_sentiment, the
# reshaped emotion columns (one per emotion type) and profanity.
get_sentiment_row <- function(text, title, year) {
  mean_sentiment <- sentiment_by(text)$ave_sentiment
  emotion_long <- emotion_by(text)
  # Long -> wide: one column per emotion type, keyed by element_id
  emotion_wide <- reshape(
    emotion_long[, c("element_id", "emotion_type", "ave_emotion")],
    idvar = "element_id",
    timevar = "emotion_type",
    direction = "wide"
  )
  # Drop the "ave_emotion." prefix that reshape() puts in front of each name
  names(emotion_wide) <- gsub("ave_emotion\\.", "", names(emotion_wide))
  mean_profanity <- profanity_by(text)$ave_profanity
  data.frame(
    "title" = title,
    "year" = year,
    "avg_sentiment" = mean_sentiment,
    emotion_wide,
    "profanity" = mean_profanity
  )
}
# Deleting path and extension from titles
# (basename() removes the directory part regardless of how the path is spelled)
titles <- gsub("\\.srt|\\(.*", "", basename(subs)) %>% stringi::stri_trim()
# Getting year of the movie
years <- gsub(".*\\(|\\)\\.srt", "", basename(subs))
# Load the scores from the csv cache if it exists; otherwise compute them for
# every sampled subtitle and write the cache
if (file.exists("sentiment_scores.csv")) {
  senti_df <- read.csv("sentiment_scores.csv")
} else {
  # Creating original data frame with 1 row (the first subtitle); only needed
  # when there is no cache to load
  senti_df <- get_sentiment_row(text, title, year = year)
  # Creating progress bar
  pb <- txtProgressBar(min = 0, max = length(subs), style = 3, width = 50,
                       char = "=")
  # All subtitles but the first one; unlike 2:length(subs) this is an empty
  # sequence when there is only a single file
  for (i in seq_along(subs)[-1]) {
    setTxtProgressBar(pb, i)
    sub <- subs[i]
    title <- titles[i]
    year <- years[i]
    # Error handling is needed because some subtitle files are empty; a file
    # that fails at any step is simply left out. rbind() stays inside the
    # handler so that a malformed row is skipped as well.
    tryCatch({
      subtitle <- read_srt(sub)
      subtitle <- subtitle$subtitle
      text <- paste(subtitle, collapse = " ") %>% gsub("\n", " ", .)
      row <- get_sentiment_row(text, title, year)
      # Keep the row only if its score columns (everything except title, year
      # and element_id) do not sum to zero
      if (rowSums(row[, -c(1:2, 4)]) != 0) {
        senti_df <- rbind(senti_df, row)
      }
    }, error = function(e) NULL)
  }
  close(pb)
  write.csv(senti_df, "sentiment_scores.csv", row.names = FALSE)
}
# Convert both year columns to integer; they serve as join keys further down
senti_df[["year"]] <- as.integer(senti_df[["year"]])
df[["startYear"]] <- as.integer(df[["startYear"]])
head(senti_df)
## title year avg_sentiment element_id anger
## 1 Harlem Nights 1989 0.014638470 1 0.02153982
## 2 Shallow Grave 1987 0.006460613 1 0.01657917
## 3 Love Songs 2007 0.005476916 1 0.01570211
## 4 The Future Is Woman 1984 0.091300115 1 0.01151742
## 5 Ambush at Tomahawk Gap 1953 0.015940097 1 0.01758555
## 6 Night Walk 2019 0.062349143 1 0.01263872
## anger_negated anticipation anticipation_negated disgust
## 1 0.002651055 0.02087706 0.001325527 0.013697117
## 2 0.002348715 0.02238187 0.002072396 0.012987013
## 3 0.002093614 0.02497383 0.003589053 0.013907582
## 4 0.001727613 0.02533832 0.002879355 0.008350130
## 5 0.005703422 0.01568441 0.004039924 0.005703422
## 6 0.002055076 0.02168105 0.002671599 0.007912043
## disgust_negated fear fear_negated joy joy_negated
## 1 0.002540594 0.02098752 0.001767370 0.019661991 0.0009941456
## 2 0.001934236 0.02417795 0.002763194 0.026802984 0.0019342360
## 3 0.001196351 0.01869299 0.001345895 0.025123374 0.0035890534
## 4 0.001439678 0.01526058 0.002015549 0.030521163 0.0017276130
## 5 0.001188213 0.01972433 0.004277567 0.008792776 0.0045152091
## 6 0.001027538 0.01798192 0.002774353 0.018392931 0.0006165228
## sadness sadness_negated surprise surprise_negated trust
## 1 0.01866784 0.001215067 0.013476196 0.0009941456 0.01922015
## 2 0.02998066 0.003177673 0.010223819 0.0011052777 0.02763194
## 3 0.01914162 0.003589053 0.016001196 0.0013458950 0.02153432
## 4 0.01727613 0.003455226 0.015836453 0.0011517420 0.02533832
## 5 0.01473384 0.004039924 0.009743346 0.0054657795 0.01972433
## 6 0.01222770 0.002671599 0.009145088 0.0007192766 0.03185368
## trust_negated profanity
## 1 0.001104606 0.0185573843
## 2 0.003039514 0.0044211108
## 3 0.002990878 0.0040376851
## 4 0.001727613 0.0020155485
## 5 0.006653992 0.0002376426
## 6 0.001541307 0.0083230580
# Joining rating dataset
ratings <- read.csv("title.ratings.tsv", header = TRUE, sep = "\t")
# The key is spelled out so that the join cannot silently switch to a
# different set of shared columns (and no "Joining, by" message is printed)
df <- inner_join(df, ratings, by = "tconst")
# Column to be used
cols <- c("tconst", "primaryTitle", "startYear", "main_genre", "averageRating")
# Joining the two dataframes on title and year
senti <- inner_join(senti_df, df[, cols],
                    by = c("title" = "primaryTitle", "year" = "startYear"))
# Multiplying sentiment scores by 100 (currently disabled)
# senti[, names(senti)[5:21]] <- senti[, names(senti)[5:21]]*100
# Saving data frame as a csv
write.csv(senti, "sentiment_db.csv", row.names = FALSE)
senti <- read.csv("sentiment_db.csv")
names(senti)
## [1] "title" "year" "avg_sentiment"
## [4] "element_id" "anger" "anger_negated"
## [7] "anticipation" "anticipation_negated" "disgust"
## [10] "disgust_negated" "fear" "fear_negated"
## [13] "joy" "joy_negated" "sadness"
## [16] "sadness_negated" "surprise" "surprise_negated"
## [19] "trust" "trust_negated" "profanity"
## [22] "tconst" "main_genre" "averageRating"
# Keep only the genres that occur in more than 25 rows
genres <- local({
  genre_counts <- table(senti$main_genre)
  names(genre_counts[genre_counts > 25])
})
senti <- senti[senti$main_genre %in% genres, ]
# Sentiment per genre: box plots (outliers hidden) with the individual movies
# jittered on top, point size showing profanity and colour showing surprise.
# Only rows with avg_sentiment below 0.5 are drawn.
ggplot(
  data = senti[senti$avg_sentiment < 0.5, ],
  mapping = aes(x = avg_sentiment, y = main_genre, fill = main_genre)
) +
  geom_boxplot(alpha = 1, outlier.shape = NA) +
  geom_jitter(aes(size = profanity, colour = surprise), alpha = 0.2) +
  scale_colour_gradient(low = "white", high = "red") +
  coord_flip() +
  guides(fill = "none") +
  labs(x = "Sentiment", y = "Genre") +
  theme(
    axis.title = element_text(size = 20),
    axis.text = element_text(size = 10),
    legend.text = element_text(size = 10)
  )

# Yearly mean of each emotion score and of profanity
ts <- aggregate(
  cbind(anger, anticipation, disgust, fear,
        joy, sadness, surprise, trust, profanity) ~ year,
  data = senti,
  FUN = mean
)
# Long format: one row per (year, variable) pair
ts <- melt(ts, id.vars = "year")
# One line per score, restricted to years after 1930
tplot <- ggplot(
  data = ts[ts$year > 1930, ],
  mapping = aes(x = year, y = value, colour = variable)
) +
  geom_line()
# Interactive version of the plot
ggplotly(tplot)